# -*- coding:utf-8 -*-
from __future__ import division
from __future__ import print_function
import sys
from pyspark import SparkContext
from pyspark import  HiveContext


# Separator written between the column values of one output line (Ctrl-A).
_FIELD_SEP = '\001'

try:
    _text_type = unicode  # Python 2: keep emitting unicode text as before
except NameError:
    _text_type = str  # Python 3: ``str`` is already the unicode text type


def format_row(row, sep=_FIELD_SEP):
    """Render one result row as a single delimited text line.

    The values are taken positionally instead of being looked up by column
    name.  The query below is a ``select *`` over a join, so column names
    repeat (at least ``yz_uid`` comes from both tables); a name-based lookup
    on a Row resolves a repeated name to its first occurrence, which would
    write the wrong value for every later column sharing that name.

    Args:
        row: An iterable of column values (a Spark Row or any tuple/list).
        sep: Text placed between consecutive values.

    Returns:
        The values converted to text and joined with ``sep``.  ``None`` is
        written as the literal text ``None``; an empty row gives ``''``.
    """
    return sep.join(_text_type(value) for value in row)


def main(argv):
    """Dump the joined customer/order-goods rows to a text file.

    Args:
        argv: Command-line arguments; ``argv[1]`` is the output path handed
            to ``saveAsTextFile``.

    Returns:
        0 on success, 2 when the output path argument is missing.
    """
    if len(argv) < 2:
        prog = argv[0] if argv else 'script'
        sys.stderr.write('usage: %s <output_path>\n' % prog)
        return 2
    out_file = argv[1]

    sql = """
    select * from dw.dws_scrm_persona_customer_dim as a join dm_scrm.dm_persona_order_goods_index as b on a.yz_uid = b.yz_uid where a.gender = 1 or a.gender=2
    """

    sc = SparkContext(appName="Train gender predict model ")
    try:
        sqlContext = HiveContext(sc)
        df = sqlContext.sql(sql)

        cols = df.columns
        df.rdd.map(format_row).saveAsTextFile(out_file)

        # The output files carry no header line, so print the column order
        # to the driver's stdout for whoever consumes the dump.
        print(cols)
    finally:
        # Stop the context even when the query or the save fails.
        sc.stop()
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))

